Question 1 (Page 117.7)

Write a function kern_density and visually test how this performs for some hand constructed datasets and bandwidths

#Create the Epanechnikov kernel function with bandwidth = 1
epan_kernel <-function(x,h=1){
  x <-x/h
  a <- as.numeric(abs(x) <= 1)
  value <- (3/4) * ( 1 - x^2 ) * a
  return(value)
}

#Create the kernel density function.
#x: training vector 
#x_new: test set
#h: bandwidth 
#return kernel density estimate 

h = 1
kern_density <- function(x, h, x_new){
  sapply(x_new, function(k){
    estimate <-mean(epan_kernel(k-x,h))/h
    return(estimate)
  })
  }

# create a list of bandwidth for testing 
h = c(0.01,0.1,0.5,1,2)

# hand construct a testing dataset 
set.seed(666)
x <- rnorm(2000, 0, 1)
x_new <- sort(rnorm(100, 0, 1))

# visually test the function with different bandwidth 
for (i in h){
  plot(x_new, kern_density(x,i,x_new), xlab = "x",ylab = "Kernel Density", main = paste("Kernel Density Estimates for bandwidth =", i),type="l",col="orange")  
}

From the plots we can see that as the bandwidth becomes larger, the kernel estimate becomes smoother.

Question 2 (Page 200.3)

Question 3 (Page 200.4)

Question 4 (Page 200.5)

Question 5 (Page 200.6)

Check KKT conditions for glmnet

#Check current KKT conditions for regression vector (reference to Textbook page 189)

# Args:
#     X: A numeric data matrix.
#     y: Response vector.
#     b: Current value of the regression vector.
#     lambda: The penalty term.
#
# Returns:
#     A logical vector indicating where the KKT conditions have
#     been violated by the variables that are currently zero.

casl_lenet_check_kkt <- function(X, y, b, lambda) {
  
  resids <- y - X %*% b 
  s <- apply(X, 2, function(xj) crossprod(xj, resids)) /
    lambda / nrow(X)
  # return a vector indicating where the KKT conditions have been
  # violated by the variables that are currently zero 
  (b == 0) & (abs(s) >=1)
}

Implement lasso_reg_with_screening function (set alpha to 1)

#install.packages("glmnet")
library("glmnet")
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.4.4
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
## Loaded glmnet 2.0-16
#Check variables to see if they violate KKT conditions

# Args:
#     x:numeric values of a data matrix
#     y:response vector
#     
# Returns:
#     A logical vector indicating whether the KKT conditions have been violated
#     If KKT conditions are violated, return a vector that contains at least one TRUE
#     If KKT conditions are not violated, return a vector of FALSEs

lasso_reg_with_screening <- function(x, y){
  est <- cv.glmnet(x,y,alpha=1)
  lambda <- est$lambda.1se
  b <- est$glmnet.fit$beta[,est$lambda == lambda]
  print(b)
  casl_lenet_check_kkt(x, y, b, lambda)
}

Test on the Iris dataset

data("iris")
x <- scale(model.matrix(Sepal.Length ~. -1, iris))
y <- iris[,1]
lasso_reg_with_screening(x, y)
##       Sepal.Width      Petal.Length       Petal.Width     Speciessetosa 
##        0.23573670        1.03342444       -0.09390900        0.09750230 
## Speciesversicolor  Speciesvirginica 
##        0.00000000       -0.05469309
##       Sepal.Width      Petal.Length       Petal.Width     Speciessetosa 
##             FALSE             FALSE             FALSE             FALSE 
## Speciesversicolor  Speciesvirginica 
##             FALSE             FALSE

We get FALSE for all coefficient estimates. This indicates that no KKT conditions have been violated.